Load regions and spatial FIPS
#load health officer regions
healthofficer_shortname <- tribble(~region_acronym, ~region_long, ~ region_full,
"ABAHO", "Association of Bay Area Health Officers (ABAHO)", "Association of Bay Area Health Officers (ABAHO)",
"GSRHO", "Greater Sacramento Region Health Officers", "Greater Sacramento Region Health Officers (GSRHO)",
"RANCHO", "Rural Association of Northern California Health Officers (RANCHO)", "Rural Association of Northern California Health Officers (RANCHO)",
"SJVC", "San Joaquin Valley Consortium (SJVC)", "San Joaquin Valley Consortium (SJVC)",
"SCAL", "Southern California Health Officers", "Southern California Health Officers (SCAL)"
)
regions <- read_csv("data/CAregions.csv") %>%
select( county, region_healthofficer , dof_pop_county) %>%
rename( region_long=region_healthofficer) %>%
left_join( healthofficer_shortname, by = "region_long") %>% na.omit()
#load data for spatial plotting
spdf <- geojsonsf::geojson_sf("data/counties.geojson")
county_fips<-data.frame(county= spdf$NAME_1, county_fips= spdf$COUNTYFI_1)
Standardized color palette and labels for plotting
MAE_map_col<-viridis_pal()(6)
# MAE_map_col<-rev(map_col[1:10])
names(MAE_map_col) <- c("columbia", "covid_nearterm", "m.proj", "lemma", "simple", "CA-baseline")
gglabels_MAE<-c("columbia"="Columbia", "covid_nearterm"= "COVID Near-Term", "m.proj"="Ensemble", "lemma"="Lemma", "simple"="Simple Growth", "CA-baseline"="CA-baseline")
forecaster_map_col<-viridis_pal()(6)
# MAE_map_col<-rev(map_col[1:10])
names(forecaster_map_col) <- c("Columbia", "COVID Nearterm", "Ensemble", "LEMMA", "Simple Growth", "CA-baseline")
region_col<- met.brewer("Hokusai1", 5)
names(region_col)<-c("Association of Bay Area Health Officers (ABAHO)", "Greater Sacramento Region Health Officers (GSRHO)", "Rural Association of Northern California Health Officers (RANCHO)", "San Joaquin Valley Consortium (SJVC)", "Southern California Health Officers (SCAL)")
region_col_abr<- met.brewer("Hokusai1", 5)
names(region_col_abr)<-c("ABAHO", "GSRHO", "RANCHO", "SJVC", "SCAL")
num_forecasts_col<-met.brewer("Hiroshige", 5)
Figure 1. Plot hospitalizations and variant prevalence to identify periods of interest
#define start and end dates for analysis period
alpha_start<- as.Date("2021-04-21")
alpha_end<- as.Date("2021-06-01")
delta_start<- as.Date("2021-07-21")
delta_end<- as.Date("2021-09-01")
omicron_start<- as.Date("2021-12-21")
omicron_end<- as.Date("2022-02-01")
variant_dates <- tribble(
~variant_period, ~start_date, ~end_date,
"alpha_frac", alpha_start, alpha_end,
"delta_frac", delta_start, delta_end,
"omicron_frac", omicron_start, omicron_end
)
#define color palette and labels for variants
variant_cols <- met.brewer("Greek", 5)
names(variant_cols) <- c("alpha_frac", "delta_frac", "gamma_frac",
"omicron_frac", "other_frac")
variant_labels <- c("alpha_frac"= "Alpha", "delta_frac"= "Delta" , "gamma_frac"= "Gamma",
"omicron_frac"="Omicron", "other_frac"="Other")
#actual hospitalizations
actuals <- fread("data/COVID_hospital_census.csv") %>% mutate(index = mdy( Most.Recent.Date)) %>% rename(county=County.Name) %>% left_join(regions, by= "county") %>%
group_by( index ) %>% select(county, region=region_acronym, index, hospitalized= COVID.19.Positive.Patients)
actuals_state<- actuals %>% group_by( index ) %>% summarize(hospitalized= sum(hospitalized, na.rm=TRUE))
A<-actuals_state %>% filter(index> as.Date("2021-02-01"), index<as.Date("2022-02-01")) %>% ggplot() + geom_line(aes(x=index, y=hospitalized)) + theme_classic()+ xlab("Date")+ ylab("Hospitalization Census")+
geom_rect(aes(xmin = start_date, xmax = end_date, fill = variant_period),
ymin = -Inf, ymax = Inf, alpha = 0.3,
data = variant_dates)+
scale_fill_manual(name="Analysis\nPeriod", labels=variant_labels[names(variant_cols) %in% unique(variant_dates$variant_period)], values=variant_cols[names(variant_cols) %in% unique(variant_dates$variant_period)] )
variants <- fread("data/region_variants_summary.csv") %>% mutate(index=as.Date(index))
variants_state<- variants %>% filter(region== "California", index> as.Date("2021-02-01") & index < as.Date("2022-02-01")) %>% select(index, region, alpha_frac, gamma_frac, omicron_frac, delta_frac, other_frac) %>% pivot_longer(!c(region, index), names_to = "Variant", values_to = "fraction")
B<- variants_state %>% ggplot()+ geom_line(aes(x=index, y=fraction, col=Variant))+ xlab("Date")+ ylab("Variant Prevalence") + theme_classic()+
scale_color_manual(values=variant_cols, labels = variant_labels)+
# scale_color_manual(values=met.brewer("Greek", 5), labels = c("alpha_frac"= "Alpha", "delta_frac"= "Delta" , "gamma_frac"= "Gamma",
# "omicron_frac"="Omicron", "other_frac"="Other"))+
geom_rect(data = variant_dates, aes(xmin = start_date, xmax = end_date, fill = variant_period), ymin = -Inf, ymax = Inf, alpha = 0.3)+ scale_fill_manual(name="Analysis\nPeriod", labels=variant_labels[names(variant_cols) %in% unique(variant_dates$variant_period)], values=variant_cols[names(variant_cols) %in% unique(variant_dates$variant_period)] )+
guides(fill="none")
reff_state <- read_csv("data/state_reff.csv") %>%
filter(model == "m.rt") %>% mutate( location = "CA", location_level= "state")
C<- reff_state %>% filter(index> as.Date("2021-02-01"), index< as.Date("2022-02-01")) %>% ggplot()+ geom_line(aes(x=index, y=estimate))+ geom_hline(yintercept=1, lty="dashed")+ xlab("Date")+ ylab("R-effective Estimate") + theme_classic()+
scale_color_manual(values=variant_cols, labels = variant_labels)+
geom_rect(data = variant_dates, aes(xmin = start_date, xmax = end_date, fill = variant_period), ymin = -Inf, ymax = Inf, alpha = 0.3)+ scale_fill_manual(name="Analysis\nPeriod", labels=variant_labels[names(variant_cols) %in% unique(variant_dates$variant_period)], values=variant_cols[names(variant_cols) %in% unique(variant_dates$variant_period)] )+
guides(fill="none")
col_one<- cowplot::plot_grid(A, B, C, labels="AUTO", ncol=1, align="hv", axis="lr")
D<- regions %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = region_full)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Health Officer Regions", values= region_col)+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = guide_legend(title.position="top", nrow = 6)) +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
legend.position="bottom")
cowplot::plot_grid(col_one, D, labels=c("", "D"), ncol=2, rel_widths=c(2, 1))

Import and Clean Socio-Economic Data Sets
CA county data sets from: https://www.ers.usda.gov/data-products/county-level-data-sets/
- Poverty (2019): All people in poverty (2019) %
- Unemployment(annual average 2020 unemployment and 2019 median income latest): Unemployment rate (%) from 2019; Median Household Income
- Education (2015-2019): 2015-2019 5-yr average percentage completing college (university degree); 2013 Rural-urban Continuum code
#Socio-economic county-level data
#daily vax record
vx_odp_raw <-read.csv("https://data.chhs.ca.gov/dataset/e283ee5a-cf18-4f20-a92c-ee94a2866ccd/resource/130d7ba2-b6eb-438d-a412-741bde207e1c/download/covid19vaccinesbycounty.csv") %>% mutate(index = as.Date(administered_date)+ 14) #Add two week lag beyond administration date to get at "fully vaccinated" status
vx <- vx_odp_raw %>% arrange(index, county) %>% filter(index>=as.Date("2021-01-01"), county %notin% c("All CA and Non-CA Counties", "All CA Counties", "Unknown")) %>% left_join(regions) %>%
mutate(perc_COVIDvx= cumulative_fully_vaccinated/dof_pop_county) %>% select(index, county, perc_COVIDvx)
poverty<- read_xlsx("data/PovertyReport.xlsx", skip=4) %>% select(county= Name, RUC_Code=`RUC Code`, perc_pov= `Percent...7`) %>% filter(county!= "California")
unemploy<-read_xlsx("data/UnemploymentReport.xlsx", skip=2) %>% select(county= Name, perc_unemploy= `2020`, income=`Median Household Income (2019)`) %>% filter(county!= "California") %>% mutate(county = gsub(" County, CA", "", county), county = gsub(" County/city, CA", "", county))
edu<-read_xlsx("data/EducationReport.xlsx", skip=2) %>% select(county= Name, education=`2015-2019`) %>% filter(county!= "California") %>% mutate(county = gsub(", CA", "", county))
reff_curve_cnty <- fread("data/cnty_reff.csv") %>%
filter(model=="m.rt") %>% select(-model, -archivedt) %>% mutate(index=as.Date(index))
reff_dates <- seq(as.Date(as.Date("2021-01-01")), max(reff_curve_cnty$index, na.rm = T), by = "1 day")
reff_change <- bind_rows(lapply(reff_dates, function(t){
reff_diff <- reff_curve_cnty %>% filter(index %in% c(t, t - lubridate::days(7))) %>%
group_by(county) %>%
summarize(low = first(estimate), high = last(estimate)) %>%
mutate(diff = high - low, index = t)
})) %>% select(index, county, diff)
7 days
Score model performance using fractional ranking
score_cards_tourney <-fread("results/score_cards_2021Jan1_2022Feb1_allMAE_baseline.csv") %>% filter(model %in% c("columbia", "lemma", "m.proj", "covid_nearterm", "simple", "CA-baseline"), geo_value %notin% c("Alpine", "Sierra", "Sutter")) %>% rename(county=geo_value) %>% mutate(index=as.Date(forecast_date), target_end_date=as.Date(target_end_date)) %>% filter(forecast_date>="2021-02-01", target_end_date <=omicron_end)
#Update MAE definitions here as needed (MAE_7day, MAE_14day, MAE_21day, MAE_28day) and filter by appropriate ahead/delay (e.g., 7, 14, 21, 28)
delay<-7
MAE_ranks_long <- score_cards_tourney %>% filter(ahead <=delay) %>% group_by(county, index) %>% select(index, county, model, forecaster, MAE=MAE_7day, norm_MAE=norm_MAE_7day) %>% distinct() %>% mutate(MAE_rank=dense_rank(desc(-MAE)), norm_MAE_rank=dense_rank(desc(-norm_MAE)))
max_models<-MAE_ranks_long %>% group_by(county, index) %>% summarize(models_per_day=max(norm_MAE_rank, na.rm=TRUE))
fractional_ranks<- MAE_ranks_long %>% left_join(max_models, by=c("county", "index")) %>% mutate(fraction_rank= 1- (norm_MAE_rank-1)/models_per_day)
daily_winner<-fractional_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
county_sums<-fractional_ranks %>% group_by(county, model) %>% summarize(county_sum= sum(fraction_rank, na.rm=TRUE))
county_sums_wide<-county_sums %>% pivot_wider(names_from="model", values_from = "county_sum")
alpha_ranks <- fractional_ranks %>% filter(index > as.Date(alpha_start) & index < as.Date(alpha_end))
alpha_daily_winner<-alpha_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
alpha_county_by_model <-alpha_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
alpha_winner<- alpha_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
alpha_model_count<- alpha_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
alpha_date_range<-length(unique(alpha_ranks$index))
num_counties<-length(unique(alpha_ranks$county))
alpha_model_sum<-alpha_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(alpha_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
delta_ranks <- fractional_ranks %>% filter(index > as.Date(delta_start) & index < as.Date(delta_end))
delta_daily_winner<-delta_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
delta_county_by_model <-delta_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
delta_winner<- delta_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
delta_model_count<- delta_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
delta_date_range<-length(unique(delta_ranks$index))
num_counties<-length(unique(delta_ranks$county))
delta_model_sum<-delta_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(delta_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
omicron_ranks <- fractional_ranks %>% filter(index > as.Date(omicron_start) & index < as.Date(omicron_end-delay))
omicron_daily_winner<-omicron_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
omicron_county_by_model <-omicron_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
omicron_winner<- omicron_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
omicron_model_count<- omicron_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
omicron_date_range<-length(unique(omicron_ranks$index))
num_counties<-length(unique(omicron_ranks$county))
omicron_model_sum<-omicron_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(omicron_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
Alpha period: Plot maps of best performing models by county
MAE_map_alpha<-alpha_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist_v2<-alpha_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
# guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-alpha_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
# guides(fill = "none") +
theme_classic()
MAE_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
# labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_alpha<-alpha_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())
col2<- cowplot::plot_grid(MAE_map_alpha, MAE_rankings_alpha, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_alpha, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

alpha_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
33 |
| columbia |
3 |
| lemma |
7 |
| m.proj |
8 |
| simple |
5 |
alpha_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| LEMMA |
0.6000000 |
0.6582263 |
| Columbia |
0.2000000 |
0.3995694 |
| Simple Growth |
0.6000000 |
0.6149167 |
| Ensemble |
0.6000000 |
0.5877652 |
| COVID Nearterm |
0.6666667 |
0.7301235 |
| CA-baseline |
0.8000000 |
0.8021667 |
Whole Delta period: Plot maps of best performing models by county for
MAE_map_delta<-delta_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist<-delta_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-delta_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
guides(fill = "none") +
theme_classic()
MAE_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_delta<-delta_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())
col2<- cowplot::plot_grid(MAE_map_delta, MAE_rankings_delta, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_delta, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

delta_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
11 |
| columbia |
2 |
| covid_nearterm |
16 |
| lemma |
7 |
| m.proj |
13 |
| simple |
6 |
delta_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| LEMMA |
0.6000000 |
0.6046805 |
| Columbia |
0.2500000 |
0.4140758 |
| COVID Nearterm |
0.8333333 |
0.7461538 |
| Simple Growth |
0.6000000 |
0.5923356 |
| Ensemble |
0.6666667 |
0.6465188 |
| CA-baseline |
0.6666667 |
0.6640650 |
Omicron: Plot maps of best performing models by count
MAE_map_omicron<-omicron_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist<-omicron_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-omicron_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
guides(fill = "none") +
theme_classic()
MAE_heatmap_omicron<-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill=guide_legend(title.position="left", nrow=1))+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill=guide_legend(title.position="left", nrow=2))+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_omicron<-omicron_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>% drop_na() %>%
mutate(med = median(fraction_rank, na.rm=TRUE), avg= mean(fraction_rank, na.rm=TRUE)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~., scale="free")+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+
guides(fill="none")+
theme(strip.text = element_blank())
# row1<-cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, labels=c("A", "B"), ncol=2, align="hv", axis = "tb")
# row2<-cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("C", "D"), ncol=2, align="hv", axis = "l")
# cowplot::plot_grid(row1, row2, ncol=1, align="hv", axis= "tb", rel_heights = c(2.5,1))
# cowplot::plot_grid(MAE_map_omicron, MAE_hist, ncol=2, axis = "l")
col2<- cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_omicron, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

# cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, col2, ncol=3, labels=c("A", "B", ""), axis = "l", rel_widths = c(1,1, .5))
omicron_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
17 |
| covid_nearterm |
14 |
| m.proj |
11 |
| simple |
13 |
omicron_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| Columbia |
NA |
NA |
| COVID Nearterm |
0.7500000 |
0.7224925 |
| Simple Growth |
0.6666667 |
0.5869519 |
| Ensemble |
0.5000000 |
0.5987611 |
| LEMMA |
0.8000000 |
0.7377133 |
| CA-baseline |
0.7500000 |
0.6886453 |
Number of forecasts participants by date
participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
#guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
legend <- cowplot::get_legend(
# create some space to the left of the legend
participants_heatmap_alpha
)
participants_heatmap_alpha<- participants_heatmap_alpha+ guides(fill="none")
participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("")+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("")+
# guides(fill=guide_legend(title.position="left", nrow=2))+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "right")#+ ggtitle("Best daily performing model by county as measured by MAE")
# cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
row1<- cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
cowplot::plot_grid(row1, legend, ncol=1, rel_heights=c(3,0.5))

Train Random Forest Classification
## Random Forest
##
## 14336 samples
## 13 predictor
## 6 classes: 'simple', 'lemma', 'CA-baseline', 'm.proj', 'covid_nearterm', 'columbia'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (4 fold, repeated 4 times)
## Summary of sample sizes: 10753, 10752, 10752, 10751, 10752, 10752, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.5936809 0.4791519
## 7 0.6082419 0.5010488
## 13 0.6032020 0.4946206
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

## rf variable importance
##
## Overall
## perc_COVIDvx 100.000
## r_eff 76.989
## diff 72.680
## delta_frac 36.048
## pop_size 33.101
## other_frac 26.019
## alpha_frac 23.536
## gamma_frac 15.309
## omicron_frac 6.161
## income 3.624
## perc_unemploy 1.937
## education 1.583
## perc_pov 0.000
14 days
Score model performance using fractional ranking
#Update MAE definitions here as needed (MAE_7day, MAE_14day, MAE_21day, MAE_28day) and filter by appropriate ahead/delay (e.g., 7, 14, 21, 28)
delay<-14
MAE_ranks_long <- score_cards_tourney %>% filter(ahead <=delay) %>% group_by(county, index) %>% select(index, county, model, forecaster, MAE=MAE_14day, norm_MAE=norm_MAE_14day) %>% distinct() %>% mutate(MAE_rank=dense_rank(desc(-MAE)), norm_MAE_rank=dense_rank(desc(-norm_MAE)))
max_models<-MAE_ranks_long %>% group_by(county, index) %>% summarize(models_per_day=max(norm_MAE_rank, na.rm=TRUE))
fractional_ranks<- MAE_ranks_long %>% left_join(max_models, by=c("county", "index")) %>% mutate(fraction_rank= 1- (norm_MAE_rank-1)/models_per_day)
daily_winner<-fractional_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
county_sums<-fractional_ranks %>% group_by(county, model) %>% summarize(county_sum= sum(fraction_rank, na.rm=TRUE))
county_sums_wide<-county_sums %>% pivot_wider(names_from="model", values_from = "county_sum")
alpha_ranks <- fractional_ranks %>% filter(index > as.Date(alpha_start) & index < as.Date(alpha_end))
alpha_daily_winner<-alpha_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
alpha_county_by_model <-alpha_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
alpha_winner<- alpha_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
alpha_model_count<- alpha_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
alpha_date_range<-length(unique(alpha_ranks$index))
num_counties<-length(unique(alpha_ranks$county))
alpha_model_sum<-alpha_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(alpha_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
delta_ranks <- fractional_ranks %>% filter(index > as.Date(delta_start) & index < as.Date(delta_end))
delta_daily_winner<-delta_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
delta_county_by_model <-delta_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
delta_winner<- delta_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
delta_model_count<- delta_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
delta_date_range<-length(unique(delta_ranks$index))
num_counties<-length(unique(delta_ranks$county))
delta_model_sum<-delta_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(delta_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
omicron_ranks <- fractional_ranks %>% filter(index > as.Date(omicron_start) & index < as.Date(omicron_end-delay))
omicron_daily_winner<-omicron_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
omicron_county_by_model <-omicron_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
omicron_winner<- omicron_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
omicron_model_count<- omicron_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
omicron_date_range<-length(unique(omicron_ranks$index))
num_counties<-length(unique(omicron_ranks$county))
omicron_model_sum<-omicron_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(omicron_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
Alpha period: Plot maps of best performing models by county
MAE_map_alpha<-alpha_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist_v2<-alpha_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
# guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-alpha_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
# guides(fill = "none") +
theme_classic()
MAE_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_alpha<-alpha_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())
#cowplot::plot_grid(MAE_map, MAE_hist, ncol=2, axis = "l")
# cowplot::plot_grid(map14, day14_hist, ncol=2, axis = "l")
# cowplot::plot_grid(map28, day28_hist, ncol=2, axis = "l")
col2<- cowplot::plot_grid(MAE_map_alpha, MAE_rankings_alpha, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_alpha, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

alpha_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
30 |
| columbia |
4 |
| lemma |
9 |
| m.proj |
8 |
| simple |
4 |
alpha_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| LEMMA |
0.6000000 |
0.6634190 |
| Columbia |
0.2000000 |
0.4125040 |
| Simple Growth |
0.6000000 |
0.5912348 |
| Ensemble |
0.6000000 |
0.6119545 |
| COVID Nearterm |
0.6666667 |
0.6965432 |
| CA-baseline |
0.8000000 |
0.7869697 |
Whole Delta period: Plot maps of best performing models by county for
MAE_map_delta<-delta_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist<-delta_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-delta_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
guides(fill = "none") +
theme_classic()
MAE_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_delta<-delta_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())
col2<- cowplot::plot_grid(MAE_map_delta, MAE_rankings_delta, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_delta, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

delta_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
12 |
| columbia |
2 |
| covid_nearterm |
6 |
| lemma |
10 |
| m.proj |
15 |
| simple |
10 |
delta_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| LEMMA |
0.6666667 |
0.6319085 |
| Columbia |
0.3333333 |
0.4152576 |
| COVID Nearterm |
0.6666667 |
0.6815510 |
| Simple Growth |
0.6000000 |
0.5900665 |
| Ensemble |
0.6666667 |
0.6645528 |
| CA-baseline |
0.6666667 |
0.6488544 |
Omicron: Plot maps of best performing models by count
MAE_map_omicron<-omicron_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist<-omicron_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-omicron_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
guides(fill = "none") +
theme_classic()
MAE_heatmap_omicron<-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill=guide_legend(title.position="left", nrow=1))+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill=guide_legend(title.position="left", nrow=2))+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_omicron<-omicron_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>% drop_na() %>%
mutate(med = median(fraction_rank, na.rm=TRUE), avg= mean(fraction_rank, na.rm=TRUE)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~., scale="free")+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+
guides(fill="none")+
theme(strip.text = element_blank())
# row1<-cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, labels=c("A", "B"), ncol=2, align="hv", axis = "tb")
# row2<-cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("C", "D"), ncol=2, align="hv", axis = "l")
# cowplot::plot_grid(row1, row2, ncol=1, align="hv", axis= "tb", rel_heights = c(2.5,1))
# cowplot::plot_grid(MAE_map_omicron, MAE_hist, ncol=2, axis = "l")
col2<- cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_omicron, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

# cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, col2, ncol=3, labels=c("A", "B", ""), axis = "l", rel_widths = c(1,1, .5))
omicron_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
17 |
| covid_nearterm |
15 |
| m.proj |
17 |
| simple |
6 |
omicron_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| Columbia |
NA |
NA |
| COVID Nearterm |
0.7500000 |
0.7259497 |
| Simple Growth |
0.6000000 |
0.5638047 |
| Ensemble |
0.6666667 |
0.6279349 |
| LEMMA |
0.8000000 |
0.7257073 |
| CA-baseline |
0.6666667 |
0.6711672 |
Number of forecasts participants by date
participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
#guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
legend <- cowplot::get_legend(
# create some space to the left of the legend
participants_heatmap_alpha
)
participants_heatmap_alpha<- participants_heatmap_alpha+ guides(fill="none")
participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("")+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("")+
# guides(fill=guide_legend(title.position="left", nrow=2))+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "right")#+ ggtitle("Best daily performing model by county as measured by MAE")
# cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
row1<- cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
cowplot::plot_grid(row1, legend, ncol=1, rel_heights=c(3,0.5))

Train Random Forest Classification
## Random Forest
##
## 14041 samples
## 13 predictor
## 6 classes: 'simple', 'lemma', 'CA-baseline', 'm.proj', 'covid_nearterm', 'columbia'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (4 fold, repeated 4 times)
## Summary of sample sizes: 10531, 10531, 10531, 10530, 10532, 10532, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.6497753 0.5517726
## 7 0.6630225 0.5713979
## 13 0.6559360 0.5622639
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

## rf variable importance
##
## Overall
## perc_COVIDvx 100.0000
## r_eff 73.8465
## diff 65.4926
## pop_size 34.3191
## delta_frac 30.2233
## other_frac 21.5257
## alpha_frac 21.4440
## gamma_frac 11.5425
## income 2.6647
## perc_unemploy 1.5299
## education 1.3116
## omicron_frac 0.5795
## perc_pov 0.0000
21 days
Score model performance using fractional ranking
#Update MAE definitions here as needed (MAE_7day, MAE_14day, MAE_21day, MAE_28day) and filter by appropriate ahead/delay (e.g., 7, 14, 21, 28)
delay<-21
MAE_ranks_long <- score_cards_tourney %>% filter(ahead <=delay) %>% group_by(county, index) %>% select(index, county, model, forecaster, MAE=MAE_21day, norm_MAE=norm_MAE_21day) %>% distinct() %>% mutate(MAE_rank=dense_rank(desc(-MAE)), norm_MAE_rank=dense_rank(desc(-norm_MAE)))
max_models<-MAE_ranks_long %>% group_by(county, index) %>% summarize(models_per_day=max(norm_MAE_rank, na.rm=TRUE))
fractional_ranks<- MAE_ranks_long %>% left_join(max_models, by=c("county", "index")) %>% mutate(fraction_rank= 1- (norm_MAE_rank-1)/models_per_day)
daily_winner<-fractional_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
county_sums<-fractional_ranks %>% group_by(county, model) %>% summarize(county_sum= sum(fraction_rank, na.rm=TRUE))
county_sums_wide<-county_sums %>% pivot_wider(names_from="model", values_from = "county_sum")
alpha_ranks <- fractional_ranks %>% filter(index > as.Date(alpha_start) & index < as.Date(alpha_end))
alpha_daily_winner<-alpha_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
alpha_county_by_model <-alpha_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
alpha_winner<- alpha_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
alpha_model_count<- alpha_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
alpha_date_range<-length(unique(alpha_ranks$index))
num_counties<-length(unique(alpha_ranks$county))
alpha_model_sum<-alpha_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(alpha_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
delta_ranks <- fractional_ranks %>% filter(index > as.Date(delta_start) & index < as.Date(delta_end))
delta_daily_winner<-delta_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
delta_county_by_model <-delta_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
delta_winner<- delta_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
delta_model_count<- delta_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
delta_date_range<-length(unique(delta_ranks$index))
num_counties<-length(unique(delta_ranks$county))
delta_model_sum<-delta_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(delta_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
omicron_ranks <- fractional_ranks %>% filter(index > as.Date(omicron_start) & index < as.Date(omicron_end-delay))
omicron_daily_winner<-omicron_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
omicron_county_by_model <-omicron_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
omicron_winner<- omicron_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
omicron_model_count<- omicron_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
omicron_date_range<-length(unique(omicron_ranks$index))
num_counties<-length(unique(omicron_ranks$county))
omicron_model_sum<-omicron_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(omicron_model_count, by="model") %>%
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)
Alpha period: Plot maps of best performing models by county
MAE_map_alpha<-alpha_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist_v2<-alpha_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
# guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-alpha_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
# guides(fill = "none") +
theme_classic()
MAE_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_alpha<-alpha_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())
col2<- cowplot::plot_grid(MAE_map_alpha, MAE_rankings_alpha, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_alpha, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

alpha_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
26 |
| columbia |
4 |
| lemma |
13 |
| m.proj |
9 |
| simple |
4 |
alpha_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| LEMMA |
0.6000000 |
0.6595111 |
| Columbia |
0.2000000 |
0.4191308 |
| Simple Growth |
0.5000000 |
0.5830530 |
| Ensemble |
0.6000000 |
0.6278788 |
| COVID Nearterm |
0.6666667 |
0.7081481 |
| CA-baseline |
0.8000000 |
0.7731061 |
Whole Delta period: Plot maps of best performing models by county for
MAE_map_delta<-delta_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist<-delta_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-delta_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
guides(fill = "none") +
theme_classic()
MAE_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
labs(fill = "Model:") +
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_delta<-delta_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())
col2<- cowplot::plot_grid(MAE_map_delta, MAE_rankings_delta, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_delta, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

delta_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
14 |
| columbia |
2 |
| covid_nearterm |
4 |
| lemma |
12 |
| m.proj |
11 |
| simple |
12 |
delta_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| LEMMA |
0.6666667 |
0.6612781 |
| Columbia |
0.3333333 |
0.4109773 |
| COVID Nearterm |
0.6666667 |
0.6267667 |
| Simple Growth |
0.6000000 |
0.5956837 |
| Ensemble |
0.6666667 |
0.6618108 |
| CA-baseline |
0.6666667 |
0.6488174 |
Omicron: Plot maps of best performing models by count
MAE_map_omicron<-omicron_winner %>% left_join(county_fips, by="county") %>%
left_join(urbnmapr::counties, by = "county_fips") %>%
filter(state_name =="California") %>%
ggplot(mapping = aes(long, lat, group = group, fill = model)) +
geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
guides(fill = "none") +
theme_classic()+
xlab(NULL)+ ylab(NULL)+
theme(axis.line = element_blank(),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
#version 1- histogram of winning models by county
MAE_hist<-omicron_winner %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Count")+
guides(fill = "none") +
theme_classic()
#version 2- histogram of normalized ranking scores
MAE_hist<-omicron_model_sum %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_model_sum$model)] )+
scale_x_discrete(name="Model Type", labels= NULL)+
ylab("Normalized Rank")+
guides(fill = "none") +
theme_classic()
MAE_heatmap_omicron<-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=model))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_daily_winner$model)] )+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill=guide_legend(title.position="left", nrow=1))+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
guides(fill=guide_legend(title.position="left", nrow=2))+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
MAE_rankings_omicron<-omicron_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>% drop_na() %>%
mutate(med = median(fraction_rank, na.rm=TRUE), avg= mean(fraction_rank, na.rm=TRUE)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() +
geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+
geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+
facet_grid(forecaster ~., scale="free")+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+
guides(fill="none")+
theme(strip.text = element_blank())
# row1<-cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, labels=c("A", "B"), ncol=2, align="hv", axis = "tb")
# row2<-cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("C", "D"), ncol=2, align="hv", axis = "l")
# cowplot::plot_grid(row1, row2, ncol=1, align="hv", axis= "tb", rel_heights = c(2.5,1))
# cowplot::plot_grid(MAE_map_omicron, MAE_hist, ncol=2, axis = "l")
col2<- cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_omicron, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

# cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, col2, ncol=3, labels=c("A", "B", ""), axis = "l", rel_widths = c(1,1, .5))
omicron_winner %>% ungroup() %>% count(model) %>% kable
| CA-baseline |
18 |
| covid_nearterm |
14 |
| m.proj |
18 |
| simple |
5 |
omicron_ranks %>% left_join(regions, by= "county") %>% group_by(forecaster) %>%
mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
| Columbia |
NA |
NA |
| COVID Nearterm |
0.7500000 |
0.7523077 |
| Simple Growth |
0.6666667 |
0.5726212 |
| Ensemble |
0.6666667 |
0.6626515 |
| LEMMA |
0.4000000 |
0.3891304 |
| CA-baseline |
0.6666667 |
0.6514545 |
Number of forecasts participants by date
participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("County")+
#guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
legend <- cowplot::get_legend(
# create some space to the left of the legend
participants_heatmap_alpha
)
participants_heatmap_alpha<- participants_heatmap_alpha+ guides(fill="none")
participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of/Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("")+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+ ggtitle("Best daily performing model by county as measured by MAE")
participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index), fill=as.factor(models_per_day)))+
geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
scale_fill_manual(name = "Number of Available Forecasts", values=num_forecasts_col)+
scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
ylab("")+
# guides(fill=guide_legend(title.position="left", nrow=2))+
guides(fill="none")+
theme_classic()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "right")#+ ggtitle("Best daily performing model by county as measured by MAE")
# cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
row1<- cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
cowplot::plot_grid(row1, legend, ncol=1, rel_heights=c(3,0.5))

Train Random Forest Classification
## Random Forest
##
## 13741 samples
## 13 predictor
## 6 classes: 'simple', 'lemma', 'CA-baseline', 'm.proj', 'covid_nearterm', 'columbia'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (4 fold, repeated 4 times)
## Summary of sample sizes: 10306, 10305, 10307, 10305, 10306, 10306, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.6660359 0.5725530
## 7 0.6780623 0.5901373
## 13 0.6687472 0.5780590
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

## rf variable importance
##
## Overall
## perc_COVIDvx 100.0000
## r_eff 71.1164
## diff 63.5219
## pop_size 38.8639
## delta_frac 30.2719
## other_frac 21.3506
## alpha_frac 19.7568
## gamma_frac 10.0062
## income 2.5827
## perc_unemploy 2.1010
## education 1.2531
## omicron_frac 0.6508
## perc_pov 0.0000
Relative error
score_cards_tourney_rel <- score_cards_tourney %>% mutate(relative_error= predicted-actual, norm_relative_error= relative_error*100/hosp_capacity)
rel_error_7day<- score_cards_tourney_rel %>% filter(ahead==7) %>% mutate(variant_period=
case_when(index > as.Date(alpha_start) & index < as.Date(alpha_end) ~"Alpha",
index > as.Date(delta_start) & index < as.Date(delta_end) ~"Delta",
index > as.Date(omicron_start) & index < as.Date(omicron_end) ~"Omicron")) #%>% drop_na(variant_period)
rel_error_14day<- score_cards_tourney_rel %>% filter(ahead==14) %>% mutate(variant_period=
case_when(index > as.Date(alpha_start) & index < as.Date(alpha_end) ~"Alpha",
index > as.Date(delta_start) & index < as.Date(delta_end) ~"Delta",
index > as.Date(omicron_start) & index < as.Date(omicron_end) ~"Omicron")) %>% drop_na(variant_period)
rel_error_21day<- score_cards_tourney_rel %>% filter(ahead==21) %>% mutate(variant_period=
case_when(index > as.Date(alpha_start) & index < as.Date(alpha_end) ~"Alpha",
index > as.Date(delta_start) & index < as.Date(delta_end) ~"Delta",
index > as.Date(omicron_start) & index < as.Date(omicron_end) ~"Omicron")) %>% drop_na(variant_period)
rel_error<- rel_error_7day %>% bind_rows(rel_error_14day) %>% bind_rows(rel_error_21day)
alpha_rel_error_7day <- rel_error_7day %>% filter(index > as.Date(alpha_start) & index < as.Date(alpha_end)) %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) +
theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ theme(strip.text = element_blank())+ ggtitle("Alpha: 7-Day Relative Error")
delta_rel_error_7day <- rel_error_7day %>% filter(index > as.Date(delta_start) & index < as.Date(delta_end)) %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_rel_error_7day$forecaster)]) +
theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ theme(strip.text = element_blank())+ ggtitle("Delta: 7-Day Relative Error")
omicron_rel_error_14day <- rel_error_14day %>% filter(index > as.Date(omicron_start) & index < as.Date(omicron_end)) %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_14day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_14day$forecaster)]) +
theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ theme(strip.text = element_blank())+ ggtitle("Omicron: 14-Day Relative Error")
rel_error_7day %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) +
theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ geom_vline(xintercept= 0, lty="dashed") + ggtitle("7-Day Relative Error")

rel_error_7day %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+ facet_wrap(county ~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) +
theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ geom_vline(xintercept= 0, lty="dashed") + ggtitle("7-Day Relative Error")

rel_error_14day %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+ facet_wrap(variant_period~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) +
theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ geom_vline(xintercept= 0, lty="dashed") + ggtitle("14-Day Relative Error")

rel_error_21day %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_rel_error_7day$forecaster)]) +
theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ geom_vline(xintercept= 0, lty="dashed") + ggtitle("21-Day Relative Error")

rel_error %>% drop_na(variant_period) %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ facet_grid(variant_period~ ahead, scale="free")+ geom_density_ridges(rel_min_height = 0.01) + geom_vline(xintercept=0, lty="dashed")+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) +
theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ xlim(-75,200)

A7day<-rel_error_7day %>% mutate(forecast_date=as.Date(forecast_date)) %>% group_by(forecast_date, forecaster) %>% summarise(sd=sd(norm_relative_error, na.rm=TRUE), norm_relative_error=mean(norm_relative_error, na.rm=TRUE), ymin= norm_relative_error-sd, ymax= norm_relative_error+sd) %>% ggplot() + geom_path(aes(x=forecast_date, y=norm_relative_error, col= forecaster))+ geom_ribbon(aes(x= forecast_date, ymin=ymin, ymax=ymax, fill= forecaster), alpha=0.2)+ theme_classic()+ facet_wrap(forecaster~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+
scale_color_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+ xlab("Publication Date")+ ylab("Normalized Relative Error (%)") + geom_hline(yintercept= 0, lty="dashed")+ guides(fill="none",col="none")
B14day<-rel_error_14day %>% mutate(forecast_date=as.Date(forecast_date)) %>% group_by(forecast_date, forecaster) %>% summarise(sd=sd(norm_relative_error, na.rm=TRUE), norm_relative_error=mean(norm_relative_error, na.rm=TRUE), ymin= norm_relative_error-sd, ymax= norm_relative_error+sd) %>% ggplot() + geom_path(aes(x=forecast_date, y=norm_relative_error, col= forecaster))+ geom_ribbon(aes(x= forecast_date, ymin=ymin, ymax=ymax, fill= forecaster), alpha=0.2)+ theme_classic()+ facet_wrap(forecaster~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+
scale_color_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+ xlab("Publication Date")+ ylab("Normalized Relative Error (%)") + geom_hline(yintercept= 0, lty="dashed")
C21day<-rel_error_21day %>% mutate(forecast_date=as.Date(forecast_date)) %>% group_by(forecast_date, forecaster) %>% summarise(sd=sd(norm_relative_error, na.rm=TRUE), norm_relative_error=mean(norm_relative_error, na.rm=TRUE), ymin= norm_relative_error-sd, ymax= norm_relative_error+sd) %>% ggplot() + geom_path(aes(x=forecast_date, y=norm_relative_error, col= forecaster))+ geom_ribbon(aes(x= forecast_date, ymin=ymin, ymax=ymax, fill= forecaster), alpha=0.2)+ theme_classic()+ facet_wrap(forecaster~.)+
scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+
scale_color_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+ xlab("Publication Date")+ ylab("Normalized Relative Error (%)")+ geom_hline(yintercept= 0, lty="dashed") + guides(fill="none",col="none")
cowplot::plot_grid(A7day, B14day, C21day, ncol=1, labels= "AUTO", align = "v", axis = "lr")

Plot RF results
SI Figure 23
cowplot::plot_grid(var_imp_7day, var_imp_14day, var_imp_21day, labels="AUTO", ncol= 1)

Compare ensemble performance across counties
SI Figure 24
library(forcats)
library(equatiomatic)
ensemble_scores<- score_cards_tourney %>% filter(forecaster=="Ensemble", ahead==1) %>% left_join(regions, by="county")
#Individual counties colored by region x-axis ordered by county population size
#7 day
A_7day<- ensemble_scores %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y=norm_MAE_7day)) +
geom_boxplot(aes(fill=region_acronym)) +
scale_y_continuous(trans='log10') +
# facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
theme_classic()+ ylab("Normalized 7-day MAE (log scale)")+ xlab("County (in order of increasing population size)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "right") + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") #+ #guides(fill="none") #+
legend <- cowplot::get_legend(
# create some space to the left of the legend
A_7day
)
A_7day<- A_7day+ guides(fill="none")
#14 day
B_14day <- ensemble_scores %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y=norm_MAE_14day)) +
geom_boxplot(aes(fill=region_acronym)) +
scale_y_continuous(trans='log10') +
# facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
theme_classic()+ ylab("Normalized 14-day MAE (log scale)")+ xlab("County (in order of increasing population size)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") + guides(fill="none")
#21 day
C_21day<- ensemble_scores %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y=norm_MAE_21day)) +
geom_boxplot(aes(fill=region_acronym)) +
scale_y_continuous(trans='log10') +
# facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
theme_classic()+ ylab("Normalized 21-day MAE (log scale)")+ xlab("County (in order of increasing population size)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") + guides(fill="none")
cowplot::plot_grid(A_7day, B_14day, C_21day, legend, labels=c("A", "B", "C", ""), ncol=2)

#Individual counties colored by region x-axis ordered by median norm_MAE
ensemble_scores %>% group_by(county) %>% mutate(median_norm_MAE=median(norm_MAE_14day)) %>% ggplot(aes(x=fct_reorder(county, median_norm_MAE), y=norm_MAE_14day)) +
geom_boxplot(aes(fill=region_acronym)) +
scale_y_continuous(trans='log10') +
# facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
theme_classic()+ ylab("Normalized Mean Absolute Error (log scale)")+ xlab("County (in order of median normalized MAE)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") #+ #guides(fill="none") #+

ensemble_scores %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y=norm_MAE_14day, fill=region_acronym)) +
geom_boxplot() +
#scale_y_continuous(trans='log10') +
# facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
theme_classic()+ ylab("Normalized Mean Absolute Error")+ xlab("County (in order of increasing population size)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") #+ #guides(fill="none") #+

#use log of norm_MAE because of increasing variance for smaller counties
county_lm<-lm(log(norm_MAE_14day) ~fct_reorder(county, dof_pop_county), data=ensemble_scores)
summary(county_lm)
##
## Call:
## lm(formula = log(norm_MAE_14day) ~ fct_reorder(county, dof_pop_county),
## data = ensemble_scores)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6581 -0.6717 -0.0346 0.6562 3.4809
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 1.458e+00 5.113e-02
## fct_reorder(county, dof_pop_county)Trinity -9.580e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Mono -1.895e-02 7.230e-02
## fct_reorder(county, dof_pop_county)Mariposa 7.935e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Inyo -1.894e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Plumas -4.762e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Colusa 5.116e-02 7.230e-02
## fct_reorder(county, dof_pop_county)Del Norte -3.674e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Glenn -2.328e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Lassen 5.908e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Amador -3.344e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Siskiyou -2.957e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Calaveras 6.350e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Tuolumne -2.806e-01 7.230e-02
## fct_reorder(county, dof_pop_county)San Benito 6.099e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Lake 5.188e-05 7.230e-02
## fct_reorder(county, dof_pop_county)Tehama 4.292e-02 7.230e-02
## fct_reorder(county, dof_pop_county)Yuba -4.802e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Mendocino -2.609e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Nevada -4.548e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Humboldt -9.072e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Napa -4.355e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Kings -5.922e-02 7.230e-02
## fct_reorder(county, dof_pop_county)Madera -9.468e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Shasta -1.022e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Imperial -6.776e-01 7.230e-02
## fct_reorder(county, dof_pop_county)El Dorado 9.984e-02 7.230e-02
## fct_reorder(county, dof_pop_county)Butte -9.799e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Yolo 1.394e-02 7.230e-02
## fct_reorder(county, dof_pop_county)Marin -8.099e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Santa Cruz -8.012e-01 7.230e-02
## fct_reorder(county, dof_pop_county)San Luis Obispo -7.050e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Merced -2.327e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Placer -7.814e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Solano -1.015e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Monterey -1.566e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Santa Barbara -1.052e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Tulare -7.310e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Sonoma -9.660e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Stanislaus -7.499e-01 7.230e-02
## fct_reorder(county, dof_pop_county)San Mateo -9.991e-01 7.230e-02
## fct_reorder(county, dof_pop_county)San Joaquin -9.727e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Ventura -1.188e+00 7.230e-02
## fct_reorder(county, dof_pop_county)San Francisco -2.103e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Kern -7.714e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Fresno -6.523e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Contra Costa -1.125e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Sacramento -1.314e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Alameda -1.642e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Santa Clara -1.666e+00 7.230e-02
## fct_reorder(county, dof_pop_county)San Bernardino -8.167e-01 7.230e-02
## fct_reorder(county, dof_pop_county)Riverside -1.239e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Orange -1.421e+00 7.230e-02
## fct_reorder(county, dof_pop_county)San Diego -1.601e+00 7.230e-02
## fct_reorder(county, dof_pop_county)Los Angeles -1.301e+00 7.230e-02
## t value Pr(>|t|)
## (Intercept) 28.521 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Trinity -13.249 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Mono -0.262 0.793277
## fct_reorder(county, dof_pop_county)Mariposa 10.975 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Inyo -2.619 0.008816 **
## fct_reorder(county, dof_pop_county)Plumas -6.586 4.64e-11 ***
## fct_reorder(county, dof_pop_county)Colusa 0.708 0.479226
## fct_reorder(county, dof_pop_county)Del Norte -5.082 3.78e-07 ***
## fct_reorder(county, dof_pop_county)Glenn -3.220 0.001286 **
## fct_reorder(county, dof_pop_county)Lassen 8.171 3.24e-16 ***
## fct_reorder(county, dof_pop_county)Amador -4.625 3.77e-06 ***
## fct_reorder(county, dof_pop_county)Siskiyou -4.090 4.33e-05 ***
## fct_reorder(county, dof_pop_county)Calaveras 8.783 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Tuolumne -3.881 0.000104 ***
## fct_reorder(county, dof_pop_county)San Benito 8.435 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Lake 0.001 0.999427
## fct_reorder(county, dof_pop_county)Tehama 0.594 0.552825
## fct_reorder(county, dof_pop_county)Yuba -6.642 3.19e-11 ***
## fct_reorder(county, dof_pop_county)Mendocino -3.608 0.000309 ***
## fct_reorder(county, dof_pop_county)Nevada -6.291 3.23e-10 ***
## fct_reorder(county, dof_pop_county)Humboldt -12.547 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Napa -6.023 1.74e-09 ***
## fct_reorder(county, dof_pop_county)Kings -0.819 0.412741
## fct_reorder(county, dof_pop_county)Madera -13.095 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Shasta -14.133 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Imperial -9.372 < 2e-16 ***
## fct_reorder(county, dof_pop_county)El Dorado 1.381 0.167339
## fct_reorder(county, dof_pop_county)Butte -13.553 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Yolo 0.193 0.847101
## fct_reorder(county, dof_pop_county)Marin -11.202 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Santa Cruz -11.080 < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Luis Obispo -9.750 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Merced -3.218 0.001291 **
## fct_reorder(county, dof_pop_county)Placer -10.808 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Solano -14.042 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Monterey -21.662 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Santa Barbara -14.553 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Tulare -10.110 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Sonoma -13.360 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Stanislaus -10.371 < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Mateo -13.818 < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Joaquin -13.453 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Ventura -16.437 < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Francisco -29.092 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Kern -10.669 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Fresno -9.022 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Contra Costa -15.565 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Sacramento -18.171 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Alameda -22.709 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Santa Clara -23.048 < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Bernardino -11.295 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Riverside -17.131 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Orange -19.657 < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Diego -22.148 < 2e-16 ***
## fct_reorder(county, dof_pop_county)Los Angeles -17.991 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9551 on 19140 degrees of freedom
## (715 observations deleted due to missingness)
## Multiple R-squared: 0.2971, Adjusted R-squared: 0.2951
## F-statistic: 149.8 on 54 and 19140 DF, p-value: < 2.2e-16
equatiomatic::extract_eq(county_lm)
\[
\operatorname{log(norm\_MAE\_14day)} = \alpha + \beta_{1}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Trinity}}) + \beta_{2}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Mono}}) + \beta_{3}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Mariposa}}) + \beta_{4}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Inyo}}) + \beta_{5}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Plumas}}) + \beta_{6}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Colusa}}) + \beta_{7}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Del\ Norte}}) + \beta_{8}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Glenn}}) + \beta_{9}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Lassen}}) + \beta_{10}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Amador}}) + \beta_{11}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Siskiyou}}) + \beta_{12}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Calaveras}}) + \beta_{13}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Tuolumne}}) + \beta_{14}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Benito}}) + \beta_{15}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Lake}}) + \beta_{16}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Tehama}}) + \beta_{17}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Yuba}}) + \beta_{18}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Mendocino}}) + \beta_{19}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Nevada}}) + \beta_{20}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Humboldt}}) + \beta_{21}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Napa}}) + \beta_{22}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Kings}}) + \beta_{23}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Madera}}) + \beta_{24}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Shasta}}) + \beta_{25}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Imperial}}) + \beta_{26}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{El\ Dorado}}) + \beta_{27}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Butte}}) + \beta_{28}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Yolo}}) + \beta_{29}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Marin}}) + \beta_{30}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Santa\ Cruz}}) + \beta_{31}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Luis\ Obispo}}) + \beta_{32}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Merced}}) + \beta_{33}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Placer}}) + \beta_{34}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Solano}}) + \beta_{35}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Monterey}}) + \beta_{36}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Santa\ Barbara}}) + \beta_{37}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Tulare}}) + \beta_{38}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Sonoma}}) + \beta_{39}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Stanislaus}}) + \beta_{40}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Mateo}}) + \beta_{41}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Joaquin}}) + \beta_{42}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Ventura}}) + \beta_{43}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Francisco}}) + \beta_{44}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Kern}}) + \beta_{45}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Fresno}}) + \beta_{46}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Contra\ Costa}}) + \beta_{47}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Sacramento}}) + \beta_{48}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Alameda}}) + \beta_{49}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Santa\ Clara}}) + \beta_{50}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Bernardino}}) + \beta_{51}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Riverside}}) + \beta_{52}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Orange}}) + \beta_{53}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Diego}}) + \beta_{54}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Los\ Angeles}}) + \epsilon
\]
pop_lm<-lm(log(norm_MAE_14day) ~dof_pop_county, data=ensemble_scores)
summary(pop_lm)
##
## Call:
## lm(formula = log(norm_MAE_14day) ~ dof_pop_county, data = ensemble_scores)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4127 -0.7558 -0.0203 0.7335 4.5449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.584e-01 8.838e-03 108.44 <2e-16 ***
## dof_pop_county -1.818e-07 5.258e-09 -34.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.104 on 19193 degrees of freedom
## (715 observations deleted due to missingness)
## Multiple R-squared: 0.05865, Adjusted R-squared: 0.0586
## F-statistic: 1196 on 1 and 19193 DF, p-value: < 2.2e-16
equatiomatic::extract_eq(pop_lm)
\[
\operatorname{log(norm\_MAE\_14day)} = \alpha + \beta_{1}(\operatorname{dof\_pop\_county}) + \epsilon
\]
#Grouped by regions
ensemble_scores %>% ggplot(aes(x=dof_pop_county, y=norm_MAE_14day)) +
geom_boxplot(aes(fill=region_acronym)) +
geom_smooth(method='lm')+
scale_y_continuous(trans='log10') +
scale_x_continuous(trans='log10') +
scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
# facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
#scale_fill_manual(name = "Health Officer Regions", values= met.brewer("Hokusai1", 5))+
theme_classic()+ xlab("Population Size")+ ylab("Normalized MAE")# + guides(fill="none") #+ theme(strip.text = element_blank())

#Individual counties with facet grid split by regions
ensemble_scores %>% ggplot(aes(x=norm_MAE_14day, y=fct_reorder(county, dof_pop_county), fill=region_acronym)) +
geom_boxplot() +
scale_x_continuous(trans='log10') +
geom_smooth(method='lm', orientation="y", aes(group=1), col="black", linetype="dashed", se=TRUE)+
facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
theme_classic()+ xlab("Normalized Mean Absolute Error")+ ylab("County")+ guides(fill="none") #+ theme(strip.text = element_blank())

max_models %>% group_by(county) %>% left_join(regions, by="county") %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y= models_per_day)) + geom_boxplot() + theme_classic()

Print session info
print(sessionInfo(), locale = TRUE)
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.7 (Maipo)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] equatiomatic_0.3.1 doParallel_1.0.16 iterators_1.0.13
## [4] foreach_1.5.1 caret_6.0-90 lattice_0.20-38
## [7] ggridges_0.5.3 glue_1.6.2 magrittr_1.5
## [10] MetBrewer_0.1.0 urbnmapr_0.0.0.9002 viridis_0.6.2
## [13] viridisLite_0.4.0 readxl_1.3.1 rmarkdown_2.11
## [16] gridExtra_2.3 ggthemes_4.2.0 data.table_1.12.8
## [19] knitr_1.28 roll_1.1.6 lubridate_1.7.4
## [22] forcats_0.4.0 stringr_1.4.0 dplyr_1.0.9
## [25] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
## [28] tibble_3.1.7 ggplot2_3.3.5 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ellipsis_0.3.2 class_7.3-15
## [4] fs_1.3.2 rstudioapi_0.13 listenv_0.8.0
## [7] farver_2.0.3 bit64_0.9-7 prodlim_2019.11.13
## [10] fansi_0.4.1 xml2_1.3.2 codetools_0.2-16
## [13] splines_3.6.0 jsonlite_1.6.1 pROC_1.18.0
## [16] broom_0.8.0 dbplyr_1.4.4 shiny_1.4.0.2
## [19] mapproj_1.2.7 compiler_3.6.0 httr_1.4.1
## [22] backports_1.1.7 fastmap_1.0.1 assertthat_0.2.1
## [25] Matrix_1.2-17 cli_3.3.0 later_1.1.0.1
## [28] htmltools_0.4.0 tools_3.6.0 gtable_0.3.0
## [31] reshape2_1.4.3 maps_3.4.0 Rcpp_1.0.4.6
## [34] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.4.1
## [37] nlme_3.1-139 timeDate_3043.102 gower_0.2.2
## [40] xfun_0.28 globals_0.14.0 rvest_0.3.5
## [43] mime_0.9 lifecycle_1.0.1 future_1.25.0
## [46] MASS_7.3-51.4 scales_1.1.1 ipred_0.9-12
## [49] vroom_1.5.7 promises_1.1.1 hms_0.5.3
## [52] yaml_2.2.1 rpart_4.1-15 stringi_1.4.6
## [55] highr_0.8 randomForest_4.6-14 e1071_1.7-0
## [58] hardhat_0.2.0 lava_1.6.10 rlang_1.0.2
## [61] pkgconfig_2.0.3 evaluate_0.14 recipes_0.2.0
## [64] labeling_0.3 cowplot_1.1.1 bit_1.1-14
## [67] tidyselect_1.1.2 parallelly_1.31.1 plyr_1.8.4
## [70] geojsonsf_2.0.0 R6_2.4.1 generics_0.1.2
## [73] DBI_1.1.0 mgcv_1.8-28 pillar_1.7.0
## [76] haven_2.2.0 withr_2.2.0 survival_2.44-1.1
## [79] nnet_7.3-12 future.apply_1.8.1 modelr_0.1.5
## [82] crayon_1.3.4 utf8_1.1.4 tzdb_0.3.0
## [85] blob_1.2.0 ModelMetrics_1.2.2.2 reprex_0.3.0
## [88] digest_0.6.25 xtable_1.8-4 httpuv_1.5.4
## [91] RcppParallel_5.0.1 stats4_3.6.0 munsell_0.5.0